#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Load packages
library(tidyverse)
library(survey)
library(tidycensus)
library(here)
library(kableExtra)

#load survey data
g <- readRDS(here("data", "inter", "fairsurveys", "fairsurvey_2022.rds"))

#load census data----
acs.vars <- tidycensus::load_variables(2021, "acs5")
#sex by age by education
SexAgeEdu <- subset(acs.vars, concept == "SEX BY AGE BY EDUCATIONAL ATTAINMENT FOR THE POPULATION 18 YEARS AND OVER")
SexAgeEdu <- SexAgeEdu[stringr::str_count(SexAgeEdu$label, "!") == 8, ]
#household income
income <- subset(acs.vars, concept == "HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2021 INFLATION-ADJUSTED DOLLARS)")
income <- income[-1,]
#sex by age by white
RaceAgeSex <- subset(acs.vars, concept %in% c("SEX BY AGE (WHITE ALONE)", "SEX BY AGE (BLACK OR AFRICAN AMERICAN ALONE)",
                                              "SEX BY AGE (AMERICAN INDIAN AND ALASKA NATIVE ALONE)", "SEX BY AGE (ASIAN ALONE)",
                                              "SEX BY AGE (NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER ALONE)", "SEX BY AGE (SOME OTHER RACE ALONE)",
                                              "SEX BY AGE (TWO OR MORE RACES)"))
RaceAgeSex <- RaceAgeSex[stringr::str_count(RaceAgeSex$label, "!")==6,]
##B01003_001==total population
#"B25100_002", "B25100_008"
getvars <- c("B02001_002", "B01003_001", SexAgeEdu$name, income$name, RaceAgeSex$name)
#download census data
acs_long <- tidycensus::get_acs("county", county = "Greene County", state = "PA", variables = getvars, year = 2018, sumfile = "acs5")
#joint distribution of sex,age,and education
acs.SexAgeEdu <- subset(acs_long, variable %in% SexAgeEdu$name)
acs.SexAgeEdu <- merge(acs.SexAgeEdu, SexAgeEdu, by.x = "variable", by.y = "name")
joint <-
  acs.SexAgeEdu %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "gender", "age_acs", "edu"), sep = "!!")
joint <- subset(joint, select = -c(concept, estimate2, total, geography, NAME, GEOID))
joint <-
  joint %>%
  dplyr::mutate(
    edu_acs = dplyr::case_when(
      edu %in% c("Less than 9th grade", "9th to 12th grade, no diploma", "High school graduate (includes equivalency)") ~ "No college",
      edu %in% c("Associate's degree", "Some college, no degree", "Bachelor's degree", "Graduate or professional degree") ~ "College"
    )
  )
joint <-
  joint %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
joint$Freq <- joint$estimate / sum(joint$estimate)
joint$estimate <- NULL
#joint$Freq <- joint$Freq * nrow(d)
#joint distribution of race, age, and sex
totalpop <- subset(acs_long, variable == "B01003_001")
acs.RaceAgeSex <- subset(acs_long, variable %in% RaceAgeSex$name)
acs.RaceAgeSex <- merge(acs.RaceAgeSex, RaceAgeSex, by.x = "variable", by.y = "name")
race.dist <-
  acs.RaceAgeSex %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "gender", "age_acs"), sep = "!!") %>%
  dplyr::select(-c(variable, GEOID, NAME, moe, estimate2, total, geography))
race.dist$concept <- ifelse(race.dist$concept == "SEX BY AGE (WHITE ALONE)", "white", "nonwhite")
names(race.dist)[4] <- "race_acs"
##adjust age
race.dist <-
  race.dist %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age_acs %in% c("18 and 19 years", "20 to 24 years") ~ "18 to 24 years",
      age_acs %in% c("25 to 29 years", "30 to 34 years") ~ "25 to 34 years",
      age_acs %in% c("35 to 44 years") ~ "35 to 44 years",
      age_acs %in% c("45 to 54 years", "55 to 64 years") ~ "45 to 64 years",
      age_acs %in% c("65 to 74 years", "75 to 84 years", "85 years and over") ~ "65 years and over",
      T ~ NA_character_
    )
  )
race.dist <- subset(race.dist, !is.na(age_acs))
race.dist <-
  race.dist %>%
  dplyr::group_by(gender, age_acs, race_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
race.dist$Freq <- race.dist$estimate / sum(race.dist$estimate)
#race.dist$Freq <- race.dist$Freq * nrow(d)
race.dist$estimate <- NULL
#prepare income distribution
acs.income <- subset(acs_long, variable %in% income$name)
acs.income <- merge(acs.income, income, by.x = "variable", by.y = "name")
income.dist <-
  acs.income %>%
  tidyr::separate(col = label, into = c("estimate2", "total", "income"), sep = "!!")
income.dist <- subset(income.dist, select = -c(concept, estimate2, total, geography, NAME, GEOID))
income.dist <-
  income.dist %>%
  dplyr::mutate(
    income_acs = dplyr::case_when(
      income %in% c("Less than $10,000", "$10,000 to $14,999", "$15,000 to $19,999") ~ "Less than $20,000",
      income %in% c("$20,000 to $24,999", "$25,000 to $29,999", "$30,000 to $34,999", "$35,000 to $39,999") ~ "$20,000 to $39,999",
      income %in% c("$40,000 to $44,999", "$45,000 to $49,999", "$50,000 to $59,999") ~ "$40,000 to $59,999",
      income %in% c("$60,000 to $74,999", "$75,000 to $99,999") ~ "$60,000 - $99,999",
      income %in% c("$100,000 to $124,999", "$125,000 to $149,999", "$150,000 to $199,999", "$200,000 or more") ~ "$100,000 or more",
      T ~ NA_character_
    )
  )
income.dist <-
  income.dist %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarise(estimate = sum(estimate))
income.dist$Freq <- income.dist$estimate / sum(income.dist$estimate)
income.dist$estimate <- NULL
income.dist

#prep survey data for age variable
#prep race variable
g$race_acs <- g$race
table(g$race_acs)
#prep gender variable
g$gender <- ifelse(g$Gender == 1, "Female", "Male")

#prepare survey income distribution
g <-
  g %>%
  dplyr::mutate(
    income_acs = dplyr::case_when(
      income_acs %in% c("Less than $10,000", "$10,000 - $19,999") ~ "Less than $20,000",
      income_acs %in% c("$20,000 - $29,999", "$30,000 - $39,999") ~ "$20,000 to $39,999",
      income_acs %in% c("$40,000 - $49,999", "$50,000 - $59,999") ~ "$40,000 to $59,999",
      income_acs %in% c("$60,000 to $79,999", "$80,000 to $99,999") ~ "$60,000 - $99,999",
      income_acs %in% c("$100,000 - $149,999", "$150,000 - $199,999", "$200,000 or more") ~ "$100,000 or more",
      T ~ as.character(income_acs)
    )
  )
#fix names
joint$gender <- gsub(":","",joint$gender)
joint$age_acs <- gsub(":","",joint$age_acs)
race.dist$gender <- gsub(":","",race.dist$gender)
race.dist$age_acs <- gsub(":","",race.dist$age_acs)

#check joint distribution in sample----
#gender x age x education
d <-
  g %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarize(
    n = n(),
    n_weight = sum(weights)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    prop = n / sum(n),
    prop_weight = n_weight / sum(n_weight)
  ) %>%
  dplyr::left_join(., joint, by = c("gender", "age_acs", "edu_acs")) %>%
  dplyr::mutate(
    age_acs = dplyr::case_when(
      age_acs %in% c("18 to 24 years", "25 to 34 years") ~ "18-34 years",
      age_acs %in% c("35 to 44 years", "45 to 64 years") ~ "35-64 years",
      age_acs == "65 years and over" ~ "$>$65 years"
    )
  ) %>%
  dplyr::group_by(gender, age_acs, edu_acs) %>%
  dplyr::summarise(across(n:Freq, ~ sum(.x)))

d$var <- with(d, paste(gender, age_acs, edu_acs, sep = " $\\times$ "))
d <- d[, c(9, 6, 7, 8)]
d <- d[c(3:6, 1, 2, 9:12, 7,8), ]

#income
income.sample <-
  g %>%
  dplyr::group_by(income_acs) %>%
  dplyr::summarize(
    n = n(),
    n_weight = sum(weights)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    prop = n / sum(n),
    prop_weight = n_weight / sum(n_weight)
  ) %>%
  dplyr::left_join(., income.dist, by = "income_acs")
names(income.sample)[1] <- "var"
income.sample <- income.sample[c(5, 2, 3, 4, 1), c(1, 4, 5, 6)]
income.sample$var <- c("$<$\\$20,000", "\\$20,000-39,999", "\\$40,000-59,999", "\\$60,000-99,999", "$>$\\$100,000")
#race
g$white <- ifelse(g$nonwhite == 0 , 1, 0)
prop.white <- with(g, mean(white))
prop.white.w <- with(g, mean(white * weights))
pop.white <- race.dist %>%
  dplyr::group_by(race_acs) %>%
  dplyr::summarise(Freq = sum(Freq)) %>%
  slice(2)
r <- data.frame(var = "White", prop = prop.white, prop_weight = prop.white.w, Freq = pop.white$Freq)
#bind together
sample.dist <- rbind(d, income.sample, r)
names(sample.dist) <- c("", "Sample", "Weighted", "Population")
sample.dist <- sample.dist[, c(1, 2, 4, 3)]
#create table for shale shock paper
fair22 <- kableExtra::kbl(sample.dist[-c(19:20),], booktabs = TRUE, digits = 2, linesep = "", align = "lccc", format = "latex", escape = FALSE,
                  position = "t") %>%
  kableExtra::kable_styling(position = "center", font = 10.25) %>%
  kableExtra::pack_rows(escape = FALSE, "Sex/Age/Education", 1, 12) %>%
  kableExtra::pack_rows("Income", 13, 17) %>%
  kableExtra::pack_rows("Race", 18, 18)
tabout <- here("output", "tables", "si_tab_fair2022.txt")
cat(fair22, file = tabout)
fair22 <- readLines(tabout, warn = FALSE)
fair22 <- fair22[-c(1,2,3,34,35)]
cat(fair22, file = tabout)